#coding:utf-8
# @Time    : 16/12/19 9:39 AM
# @Author  : Minjie Chen

import sys
reload(sys)  # Python 2 hack: re-expose setdefaultencoding (hidden by site.py)
sys.setdefaultencoding('utf-8')  # make implicit str<->unicode conversion use UTF-8
import requests
import Queue
import math
import time
import threading
import numpy
import re
import matplotlib.pyplot as plt
import jieba.posseg as pseg
from pymongo import MongoClient
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator


#queue of listing-page ajax urls waiting to be requested
URL_QUEUE = Queue.Queue()
#queue of (position detail-page url, positionId) tuples
DETAIL_URL_QUEUE = Queue.Queue()
#raw JSON result lists waiting to be parsed
JSON_LIST = []
#raw (HTML page, positionId) tuples waiting to be parsed
DETAIL_HTML_LIST = []
#cities to crawl
#WORK_CITY = ['北京', '上海', '广州', '深圳', '杭州', '成都', '南京', '武汉', '苏州', '厦门']
WORK_CITY = ['苏州']
#urls whose requests failed
FAILED_URL_LIST = []
#stop words filtered out of the word cloud
INVALID_WORD = [ u'没有',u'什么', u'如果', u'因为', u'可以', u'怎么', u'我们', u'知道', u'答案',
                u'现在', u'可是', u'能力', u'的话', u'他们', u'现在', u'这里', u'哪里',
                u'那里', u'可能', u'Python', u'就是', u'那个', u'这样', u'你们', u'这个',
                u'已经', u'时候', u'不是', u'但是', u'所以', u'好像', u'python', u'真是',
                u'那么', u'一个', u'还有', u'职位', u'不过', u'这么', u'一起', u'大家',
                u'而且', u'不会', u'一定', u'公司', u'问题', u'企业', u'互联网', u'and',
                u'环境', u'优先']
#replace the Cookie below with your own before running
HEADERS = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/54.0.2840.98 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_Python?px=default&city=%E4%B8%8A%E6%B5%B7',
    'Cookie': 'user_trace_token=20161218185535-85cdc6012c1d4a6082a3b43948bc022a; LGUID=20161218185535-81241f51-c510-11e6-b712-525400f775ce; index_location_city=%E5%85%A8%E5%9B%BD; JSESSIONID=3582EF81F53617D01B712CB544CEF861; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1482058535,1483885110,1484228501,1484399228; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1484399228; _gat=1; _ga=GA1.2.318620386.1482058535; LGSID=20170114210707-5a28bd28-da5a-11e6-ab23-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20170114210707-5a28bf90-da5a-11e6-ab23-525400f775ce'
}

def fetcher_worker(queue):
    '''
    Worker loop: take a listing url off the queue, page through its JSON
    results and append each page's position list to JSON_LIST.
    :param queue: a Queue.Queue of listing-page ajax urls
    :return: never returns normally (loops until the process exits)
    '''
    while True:
        url = queue.get()
        try:
            page_size = get_max_page(url)
            if page_size < 0:
                FAILED_URL_LIST.append(url)
            else:
                for index in xrange(1, page_size+1):
                    data = {'first': False, 'pn': index, 'kd': 'Python'}
                    res = requests.post(url, data=data, verify=False, headers=HEADERS)
                    if res.status_code == 200:
                        result = res.json()
                        JSON_LIST.append(result['content']['positionResult']['result'])
                    else:
                        print 'request error, status code: ', res.status_code
                    #slow down to avoid getting the ip banned for hammering the site
                    time.sleep(3)
        except Exception as exc:
            #a crashed worker would never call task_done(), hanging queue.join()
            print 'fetcher_worker error: ', exc
            FAILED_URL_LIST.append(url)
        finally:
            #must run exactly once per get() or URL_QUEUE.join() deadlocks
            queue.task_done()


def detail_fetcher_worker(queue):
    '''
    Worker loop: fetch each position detail page and stash the raw HTML.
    :param queue: a Queue.Queue of (detail url, positionId) tuples
    :return: never returns normally (loops until the process exits)
    '''
    while True:
        url = queue.get()
        try:
            res = requests.get(url[0], verify=False, headers=HEADERS)
            if res.status_code == 200:
                DETAIL_HTML_LIST.append((res.text, url[1]))
            else:
                print 'request error, status code: ', res.status_code, url[0]
            #slow down to avoid getting the ip banned for hammering the site
            time.sleep(3)
        except Exception as exc:
            #a crashed worker would never call task_done(), hanging queue.join()
            print 'detail_fetcher_worker error: ', exc, url[0]
            FAILED_URL_LIST.append(url[0])
        finally:
            #must run exactly once per get() or DETAIL_URL_QUEUE.join() deadlocks
            queue.task_done()



#获取一个url需要请求的页数,小于零即为出错
def get_max_page(url):
    data = {'first': True, 'pn': 1, 'kd': 'Python'}
    res = requests.post(url, verify=False, data=data, headers=HEADERS)
    if res.status_code == 200:
        print res.text
        res_json = res.json()
        total_count = float(res_json['content']['positionResult']['totalCount'])
        result_size = float(res_json['content']['positionResult']['resultSize'])
        try:
            max_page = total_count/result_size
            #向上取整
            return int(math.ceil(max_page))
        except ZeroDivisionError:
            print 'Zero Division Error!'
            return -1
    else:
        print 'response status code: ', res.status_code
        return -1


def draw_word_cloud(text_list, pic_path, font_path, font_size):
    '''
    Tokenize the job descriptions, keep noun-like words, count frequencies
    and render a word cloud shaped/coloured by the given image.
    :param text_list: list of utf-8 encoded description strings
    :param pic_path: path of the image used as cloud mask and colour source
    :param font_path: ttf font path (required, otherwise CJK glyphs are boxes)
    :param font_size: max font size passed to WordCloud
    :return:
    '''
    from collections import Counter

    all_words = []
    for text in text_list:
        for word, flag in pseg.cut(text):
            #jieba POS tags containing 'n' are noun-like; keep only those
            if re.search('n', flag, re.I | re.M):
                all_words.append(word.decode('utf8'))

    #Counter is O(n); the old per-unique-word list.count loop was O(n^2)
    statistics_dic = dict(Counter(all_words))

    for invalid_word in INVALID_WORD:
        key = invalid_word.decode('utf8')
        #pop the same (decoded) key that was tested; popping the raw word
        #only worked via the setdefaultencoding hack and risked KeyError
        if key in statistics_dic:
            statistics_dic.pop(key)

    #single-character tokens carry no meaning in the cloud, drop them
    to_del_word = [key for key in statistics_dic if len(key) <= 1]
    for word in to_del_word:
        statistics_dic.pop(word)

    #most frequent first, as fit_words expects
    statistics_list = sorted(statistics_dic.items(), key=lambda x: x[1], reverse=True)

    #image used both as mask shape and as colour palette
    mask = numpy.array(Image.open(pic_path))
    image_colors = ImageColorGenerator(mask)

    #font_path must be set, otherwise Chinese characters will not display
    wc = WordCloud(background_color="white",
                   font_path=font_path,
                   mask=mask,
                   max_font_size=font_size,
                   random_state=10)
    wc.fit_words(statistics_list)
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()


def main():
    '''
    Crawl lagou.com Python listings for every city in WORK_CITY, persist the
    position summaries to MongoDB, then fetch each position's detail page.
    '''
    # build the listing ajax url for every target city
    for city in WORK_CITY:
        URL_QUEUE.put('https://www.lagou.com/jobs/positionAjax.json?'
                      'px=default&city={0}&needAddtionalResult=false'.format(city))

    for index in xrange(5):
        thread = threading.Thread(target=fetcher_worker, args=(URL_QUEUE,))
        # workers loop forever; without daemon the interpreter can never
        # exit after main() returns
        thread.setDaemon(True)
        thread.start()

    URL_QUEUE.join()

    client = MongoClient('localhost', 27017)
    coll = client.laGou.python_position
    detail = client.laGou.position_detail

    for page in JSON_LIST:
        for position in page:
            document = {
                '_id': position['positionId'],
                'company': position['companyFullName'],
                'salary': position['salary'],
                'city': position['city'],
                'position': ','.join([position['positionName'], position['secondType']])
            }
            coll.insert_one(document)
            DETAIL_URL_QUEUE.put(('https://www.lagou.com/jobs/{0}.html'.format(position['positionId']), position['positionId']))

    for index in xrange(5):
        thread = threading.Thread(target=detail_fetcher_worker, args=(DETAIL_URL_QUEUE,))
        # same reason as above: daemon so the process can exit
        thread.setDaemon(True)
        thread.start()

    DETAIL_URL_QUEUE.join()

    for html in DETAIL_HTML_LIST:
        # explicit parser: omitting it emits a warning and picks whatever
        # parser happens to be installed, giving platform-dependent trees
        soup = BeautifulSoup(html[0], 'html.parser')
        # text = soup.find('dd', attrs={'class': 'job_bt'}).get_text()
        # document = {
        #     '_id': html[1],
        #     'detail': text
        # }
        # detail.insert_one(document)

    # result = detail.find()
    # text_list = []
    # for i in result:
    #     text_list.append(i['detail'].encode('utf8'))
    # draw_word_cloud(text_list, 'x.png', 'font.ttf', 25)


# run the crawler only when executed as a script, not when imported
if __name__ == '__main__':
    main()
